@inproceedings{wang2012understanding,
  author    = {Wang, Jingjing and
               Wang, Haixun and
               Wang, Zhongyuan and
               Zhu, Kenny Qili},
  title     = {Understanding Tables on the Web},
  booktitle = {ER},
  year      = {2012},
  pages     = {141--155},
}

@inproceedings{gupta11,
  author      = {Rahul Gupta and Sunita Sarawagi},
  title       = {Joint Training for Open-domain Extraction on the Web: Exploiting Overlap when Supervision is Limited},
  booktitle   = {WSDM},
  year        = {2011},
  documenturl = {http://www.it.iitb.ac.in/~sunita/papers/wsdm11.pdf},
  abstract    = {
We consider the problem of jointly training structured models for
extraction from multiple web sources whose records enjoy partial
content overlap. This has important applications in open-domain
extraction, e.g. a user materializing a table of interest from
multiple relevant unstructured sources; or a site like Freebase
augmenting an incomplete relation by extracting more rows from web
sources. Such applications require extraction over arbitrary domains,
so one cannot use a pre-trained extractor or demand a huge labeled
dataset. We propose to overcome this lack of supervision by using
content overlap across the related web sources. Existing methods of
exploiting overlap have been developed under settings that do not
generalize easily to the scale and diversity of overlap seen on Web
sources.
   We present an agreement-based learning framework that jointly
trains the models by biasing them to agree on the agreement regions,
i.e. shared text segments. We present alternatives within our
framework to trade-off tractability, robustness to noise, and extent
of agreement enforced; and propose a scheme of partitioning agreement
regions that leads to efficient training while maximizing overall
accuracy. Further, we present a principled scheme to discover
low-noise agreement regions in unlabeled data across multiple sources.
   Through extensive experiments over 58 different extraction domains,
we establish that our framework provides significant boosts over
uncoupled training, and scores over alternatives such as collective
inference, staged training, and multi-view learning.
}
}


@article{cafarella2011structured,
  author    = {Cafarella, M. J. and Halevy, A. and Madhavan, J.},
  title     = {Structured data on the web},
  journal   = {Communications of the ACM},
  volume    = {54},
  number    = {2},
  pages     = {72--79},
  year      = {2011},
  publisher = {ACM}
}

@inproceedings{bergroth2000survey,
  author       = {Bergroth, L. and Hakonen, H. and Raita, T.},
  title        = {A survey of longest common subsequence algorithms},
  booktitle    = {Proceedings of the Seventh International Symposium on String Processing and Information Retrieval ({SPIRE} 2000)},
  pages        = {39--48},
  year         = {2000},
  organization = {IEEE}
}

@inproceedings{pimplikar12,
  author      = {Rakesh Pimplikar and Sunita Sarawagi},
  title       = {Answering Table Queries on the Web using Column Keywords},
  booktitle   = {Proceedings of the 38th International Conference on Very Large Data Bases ({VLDB})},
  year        = {2012},
  documenturl = {http://www.it.iitb.ac.in/~sunita/papers/vldb2012.pdf},
  abstract    = {We present the design of a structured search engine which returns a
multi-column table in response to a query consisting of keywords
describing each of its columns.  We answer such queries by exploiting
the millions of tables on the Web because these are much richer
sources of structured knowledge than free-format text.
However, a corpus of tables harvested from arbitrary HTML webpages
presents huge challenges of diversity and redundancy not seen in
centrally edited knowledge bases.
We concentrate on one concrete task in this paper.  Given a set of Web
tables $T_1,\ldots,T_n$, and a query $Q$ with $q$ sets of keywords
$Q_1,\ldots,Q_q$, decide for each $T_i$ if it is relevant to $Q$ and
if so, identify the mapping between the columns of $T_i$ and query
columns.
We represent this task as a graphical model that jointly maps all
tables by incorporating diverse sources of clues spanning matches in
different parts of the table, corpus-wide co-occurrence statistics,
and content overlap across table columns.  We define a novel query
segmentation model for matching keywords to table columns, and a robust
mechanism of exploiting content overlap across table columns.  We
design efficient inference algorithms based on bipartite matching and
constrained graph cuts to solve the joint labeling task.  Experiments
on a workload of 59 queries over a 25 million web table corpus shows
significant boost in accuracy over baseline IR methods.
}
}

@inproceedings{pasca2008weakly,
  author    = {Pa{\c{s}}ca, M. and Van Durme, B.},
  title     = {Weakly-supervised acquisition of open-domain classes and class attributes from web documents and query logs},
  booktitle = {Proceedings of the 46th Annual Meeting of the Association for Computational Linguistics (ACL-08)},
  pages     = {19--27},
  year      = {2008}
}

@article{limaye2010annotating,
  author    = {Limaye, G. and Sarawagi, S. and Chakrabarti, S.},
  title     = {Annotating and searching web tables using entities, types and relationships},
  journal   = {Proceedings of the VLDB Endowment},
  volume    = {3},
  number    = {1--2},
  pages     = {1338--1347},
  year      = {2010},
  publisher = {VLDB Endowment}
}

NOTE(review) for gatterbauer2006table: the pages field lists only the first
page, and standard styles warn when both volume and number are set on an
inproceedings entry. Confirm both against the published proceedings.

@inproceedings{gatterbauer2006table,
  author    = {Gatterbauer, W. and Bohunsky, P.},
  title     = {Table extraction using spatial reasoning on the {CSS2} visual box model},
  booktitle = {Proceedings of the National Conference on Artificial Intelligence},
  volume    = {21},
  number    = {2},
  pages     = {1313},
  year      = {2006},
  publisher = {AAAI Press}
}

@article{cafarella2009data,
  author    = {Cafarella, M. J. and Halevy, A. and Khoussainova, N.},
  title     = {Data integration for the relational web},
  journal   = {Proceedings of the VLDB Endowment},
  volume    = {2},
  number    = {1},
  pages     = {1090--1101},
  year      = {2009},
  publisher = {VLDB Endowment}
}

@inproceedings{das2012finding,
  author       = {Das Sarma, A. and Fang, L. and Gupta, N. and Halevy, A. and Lee, H. and Wu, F. and Xin, R. and Yu, C.},
  title        = {Finding related tables},
  booktitle    = {Proceedings of the 2012 {ACM} {SIGMOD} International Conference on Management of Data},
  pages        = {817--828},
  year         = {2012},
  organization = {ACM}
}

@article{venetis2011recovering,
  author    = {Venetis, P. and Halevy, A. and Madhavan, J. and Pa{\c{s}}ca, M. and Shen, W. and Wu, F. and Miao, G. and Wu, C.},
  title     = {Recovering semantics of tables on the web},
  journal   = {Proceedings of the VLDB Endowment},
  publisher = {VLDB Endowment},
  year      = {2011},
  volume    = {4},
  number    = {9},
  pages     = {528--538}
}

@inproceedings{cafarella2008uncovering,
  author    = {Cafarella, M. J. and Halevy, A. Y. and Zhang, Y. and Wang, D. Z. and Wu, E.},
  title     = {Uncovering the relational web},
  booktitle = {WebDB},
  year      = {2008}
}

@article{cafarella2008webtables,
  author    = {Cafarella, M. J. and Halevy, A. and Wang, D. Z. and Wu, E. and Zhang, Y.},
  title     = {{WebTables}: exploring the power of tables on the web},
  journal   = {Proceedings of the VLDB Endowment},
  volume    = {1},
  number    = {1},
  pages     = {538--549},
  year      = {2008},
  publisher = {VLDB Endowment}
}

@inproceedings{liu2003mining,
  author       = {Liu, B. and Grossman, R. and Zhai, Y.},
  title        = {Mining data records in Web pages},
  booktitle    = {Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining},
  organization = {ACM},
  year         = {2003},
  pages        = {601--606}
}

@article{elmeleegy2009harvesting,
  author    = {Elmeleegy, H. and Madhavan, J. and Halevy, A.},
  title     = {Harvesting relational tables from lists on the web},
  journal   = {Proceedings of the VLDB Endowment},
  publisher = {VLDB Endowment},
  year      = {2009},
  volume    = {2},
  number    = {1},
  pages     = {1078--1089}
}

@comment{Duplicate definition of cafarella2011structured removed here: the
  same reference is already defined earlier in this file, and BibTeX reports
  a repeated-entry error when a citation key is defined twice.}

@book{gusfield1997algorithms,
  author    = {Gusfield, D.},
  title     = {Algorithms on strings, trees and sequences: computer science and computational biology},
  publisher = {Cambridge University Press},
  year      = {1997}
}

@inproceedings{barzilay2003learning,
  author       = {Barzilay, R. and Lee, L.},
  title        = {Learning to paraphrase: An unsupervised approach using multiple-sequence alignment},
  booktitle    = {Proceedings of the 2003 Conference of the North American Chapter of the Association for Computational Linguistics on Human Language Technology-Volume 1},
  organization = {Association for Computational Linguistics},
  year         = {2003},
  pages        = {16--23}
}

@inproceedings{barzilay2002bootstrapping,
  author       = {Barzilay, R. and Lee, L.},
  title        = {Bootstrapping lexical choice via multiple-sequence alignment},
  booktitle    = {Proceedings of the ACL-02 conference on Empirical methods in natural language processing-Volume 10},
  organization = {Association for Computational Linguistics},
  year         = {2002},
  pages        = {164--171}
}

@inproceedings{lafferty2001conditional,
  author    = {Lafferty, J. and McCallum, A. and Pereira, F. C. N.},
  title     = {Conditional random fields: Probabilistic models for segmenting and labeling sequence data},
  booktitle = {Proceedings of the Eighteenth International Conference on Machine Learning ({ICML})},
  pages     = {282--289},
  year      = {2001}
}

@inproceedings{milne2008learning,
  author       = {Milne, D. and Witten, I. H.},
  title        = {Learning to link with {Wikipedia}},
  booktitle    = {Proceedings of the 17th ACM conference on Information and knowledge management},
  pages        = {509--518},
  year         = {2008},
  organization = {ACM}
}

@inproceedings{ratinov2011local,
  author    = {Ratinov, L. and Roth, D. and Downey, D. and Anderson, M.},
  title     = {Local and global algorithms for disambiguation to {Wikipedia}},
  booktitle = {Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL)},
  year      = {2011}
}

@article{wang1994complexity,
  author  = {Wang, L. and Jiang, T.},
  title   = {On the complexity of multiple sequence alignment},
  journal = {Journal of Computational Biology},
  volume  = {1},
  number  = {4},
  pages   = {337--348},
  year    = {1994}
}